# Session setup: attach packages, configure parallelism, load project helpers.

# NOTE(review): rm(list = ls()) only clears objects in the global environment;
# it does not detach packages or reset options, so it is not a substitute for
# running this script in a fresh R session.
rm(list = ls())

# library() stops with an error when a package is missing; require() would
# only warn and return FALSE, deferring the failure to the first use.
library(quanteda)
library(LSX) # v1.5.0
library(wordvector)
library(quanteda.textmodels)
library(ggplot2)
library(furrr)
library(purrr)

# Number of threads quanteda is asked to use.
quanteda_options(threads = 60)
# Project helpers. `dict`, used further down, is not defined in this script and
# presumably comes from this file -- TODO confirm.
source("functions.R")

# Run futures (future_map() below) in 32 background R sessions.
plan(multisession, workers = 32)

# Load the pre-tokenized corpora ----
# Every file in `token_dir` named tokens_*.RDS is read and cleaned in a
# parallel worker, then all pieces are concatenated into a single `toks`.
token_dir <- "data/"
token_pattern <- "^tokens_.*\\.RDS$"
token_files <- list.files(token_dir, pattern = token_pattern, full.names = TRUE)

# Fail fast: with no input files, do.call(c, list()) returns NULL and the
# script would only break later with an unrelated-looking error.
if (length(token_files) == 0L) {
  stop(
    "No files matching '", token_pattern, "' found in '", token_dir, "'",
    call. = FALSE
  )
}

toks_list <- future_map(token_files, function(f) {
  file_toks <- readRDS(f)
  # Remove tokens matching the regex "\\d" (i.e. containing a digit).
  # min_nchar = 2 is forwarded as well; presumably it also drops
  # single-character tokens -- confirm against tokens_select().
  tokens_remove(file_toks, "\\d", valuetype = "regex", min_nchar = 2)
})

toks <- do.call(c, toks_list)

# Seed words and context terms ----
# Both are derived from `dict` and do not depend on each other.
seed <- as.seedwords(dict$seeds)
term <- char_context(toks, dict["target"], window = 10)

# Word embeddings and LSS model ----
wdv <- textmodel_word2vec(
  toks,
  dim = 100, type = "skip-gram", normalize = FALSE,
  window = 5, min_count = 5, iter = 5,
  threads = 32, verbose = TRUE
)
# Normally not needed: load a previously saved model instead of retraining.
# wdv <- readRDS("data/word2vec_model.RDS")

# Build the LSS model from the embeddings, the seed words and the context terms.
lss <- as.textmodel_lss(wdv, seed, term)

# Inspect the fitted model: coefficients and term plot ----
# cohesion(lss)
# Explicit print() so the output also appears when the script is run through
# source(), where bare top-level expressions are not auto-printed.
print(head(coef(lss), 100))
print(tail(coef(lss), 100))

terms <- textplot_terms(lss)
print(terms)

# Make sure the target directory exists before writing the figure; a missing
# directory would otherwise make ggsave() fail (or prompt when interactive).
dir.create("output", showWarnings = FALSE, recursive = TRUE)
ggsave(
  filename = "output/terms.png",
  plot = terms,
  width = 8,
  height = 5,
  units = "in",
  dpi = 600
)

# Evaluation of seed words ----
print(lss$seeds_weighted)
# bs_term <- bootstrap_lss(lss, mode = "terms")
# knitr::kable(head(bs_term, 10))
# bs_coef <- bootstrap_lss(lss, mode = "coef")
# knitr::kable(head(bs_coef, 10), digits = 3)

# Bar chart of the 50 largest coefficients.
top_terms <- head(sort(coef(lss), decreasing = TRUE), 50)
barplot(top_terms, las = 2, col = "steelblue")


# Prediction ----
# Document-feature matrix restricted to features occurring at least 10 times.
dfmt <- dfm(toks) %>%
  dfm_trim(min_termfreq = 10)

# dfm_group() is called without an explicit `groups` argument, so the
# grouping is whatever its default provides.
dfmt_group <- dfm_group(dfmt)
dat <- docvars(dfmt_group)

# `newdata` is spelled out in full; the former `newdat` only worked through
# R's partial argument matching.
pred <- predict(lss, newdata = dfmt_group, density = TRUE)
dat$lss <- pred$fit
dat$density <- pred$density

# Persist the fitted LSS model, the scored document variables and the
# word2vec model.
saveRDS(lss, "data/lss.RDS")
saveRDS(dat, "data/data_lss.RDS")
saveRDS(wdv, "data/word2vec_model.RDS")
